House Prices
House Prices
1 Objetivos
- Número de variáveis: 81
- Tipo de variáveis
- Inteiras ou discretas:
- Numéricas ou double
- Categóricas
- Qualitativas
- Qualidade dos dados
- Quantidade de NA’s por variável
- Criação de novas variáveis, se precisar
- Transformação das variáveis, se precisar
2 Conjunto de Dados de Teste
Iremos realizar nos dados de teste as mesmas transformações aplicadas aos dados de treino.
# Load the raw test set. fread() parses the CSV; the result is then passed
# through data.frame() so the rest of the script works on a plain data frame
# with character (not factor) string columns.
# NOTE(review): data.frame() uses check.names = TRUE by default, which is
# presumably what yields syntactic names such as X1stFlrSF - confirm before
# swapping it for as.data.frame().
df.test <- data.table::fread(
  "../dados/test.csv",
  sep = ",",
  showProgress = FALSE
) %>%
  data.frame(stringsAsFactors = FALSE)
df.test
3 Separando o conjunto de dados de teste pelo tipo
3.1 Dados tipo inteiro
# Keep only the columns whose class includes "integer".
# The check is made per column, so the mask always has exactly one entry per
# column even if a column carries several classes; drop = FALSE keeps a data
# frame when a single column matches.
test.int <- df.test[
  ,
  vapply(df.test, function(col) "integer" %in% class(col), logical(1)),
  drop = FALSE
]
test.int
Na amostra test.int também irei imputar dados.
# Percentage of missing values per integer column (two decimals), sorted from
# the most to the least incomplete column.
round(100 * colSums(is.na(test.int)) / nrow(test.int), 2) %>%
  sort(decreasing = TRUE)
## LotFrontage GarageYrBlt MasVnrArea BsmtFullBath BsmtHalfBath
## 15.56 5.35 1.03 0.14 0.14
## BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF GarageCars
## 0.07 0.07 0.07 0.07 0.07
## GarageArea Id MSSubClass LotArea OverallQual
## 0.07 0.00 0.00 0.00 0.00
## OverallCond YearBuilt YearRemodAdd X1stFlrSF X2ndFlrSF
## 0.00 0.00 0.00 0.00 0.00
## LowQualFinSF GrLivArea FullBath HalfBath BedroomAbvGr
## 0.00 0.00 0.00 0.00 0.00
## KitchenAbvGr TotRmsAbvGrd Fireplaces WoodDeckSF OpenPorchSF
## 0.00 0.00 0.00 0.00 0.00
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea MiscVal
## 0.00 0.00 0.00 0.00 0.00
## MoSold YrSold
## 0.00 0.00
4 Selecionando as variáveis
Removendo as mesmas variáveis do conjunto de treinamento.
# Column names kept for the integer part of the data, read from disk.
cols.int <- readRDS('../outputs/cols.int.rds')

# Restrict to Id plus those columns, then drop LotFrontage.
test.int <- select(test.int[, c('Id', cols.int)], -LotFrontage)

# Fit caret's k-nearest-neighbour imputer on every column except the first
# one (Id).
# NOTE(review): the imputer is fit on the test set itself; confirm this is
# intended rather than reusing the model fit on the training set.
preProcess_missingdata_model <- preProcess(
  test.int[, -1],
  method = 'knnImpute'
)
preProcess_missingdata_model
## Created from 1365 samples and 27 variables
##
## Pre-processing:
## - centered (27)
## - ignored (0)
## - 5 nearest neighbor imputation (27)
## - scaled (27)
Vamos agora usar esse modelo para imputar os valores ausentes de test.int.
# Apply the fitted imputer to the same columns it was built from
# (everything except the first column, Id).
test.int1 <- predict(
  preProcess_missingdata_model,
  newdata = test.int[, -1]
)
# Check whether any missing value is left.
test.int1 %>% anyNA()
## [1] FALSE
Dados imputados com sucesso!
test.int <- data.frame(Id = test.int$Id,test.int1)
4.1 Transformando dados do tipo string em categóricos
# Select the character columns and turn each one into a factor.
# The class check is made per column so the mask has one entry per column;
# drop = FALSE keeps a data frame when a single column matches.
test.fac <- df.test[
  ,
  vapply(df.test, function(col) "character" %in% class(col), logical(1)),
  drop = FALSE
]
# lapply() converts column by column. apply() would first coerce the data
# frame to a character matrix, so its result only becomes factors when
# data.frame() defaults to stringsAsFactors = TRUE (R < 4.0).
test.fac[] <- lapply(test.fac, as.factor)
# removendo as mesmas colunas do conjunto de treino.
# Drop the five columns listed below.
test.fac <- select(
  test.fac,
  -PoolQC, -MiscFeature, -Alley, -Fence, -FireplaceQu
)
# Keep only the factor columns whose names are stored in cols.fac.rds.
cols.fac <- readRDS('../outputs/cols.fac.rds')
test.fac <- test.fac[, cols.fac]
# verificando a porcentagem de valores nulos
# Percentage of missing values per factor column (two decimals), sorted from
# the most to the least incomplete column.
round(100 * colSums(is.na(test.fac)) / nrow(test.fac), 2) %>%
  sort(decreasing = TRUE)
## GarageFinish GarageCond GarageType BsmtCond BsmtQual
## 5.35 5.35 5.21 3.08 3.02
## BsmtExposure BsmtFinType1 MasVnrType MSZoning Functional
## 3.02 2.88 1.10 0.27 0.14
## Exterior1st Exterior2nd KitchenQual LotShape LandContour
## 0.07 0.07 0.07 0.00 0.00
## LandSlope Neighborhood Condition1 BldgType HouseStyle
## 0.00 0.00 0.00 0.00 0.00
## RoofStyle RoofMatl ExterQual ExterCond Foundation
## 0.00 0.00 0.00 0.00 0.00
## HeatingQC CentralAir Electrical PavedDrive SaleCondition
## 0.00 0.00 0.00 0.00 0.00
Novamente recorremos ao caret para imputar essas categorias.
Para construir um modelo que impute cada categoria, vamos retirar todas as variáveis que possuam alguma porcentagem de valores nulos e deixar somente uma delas em cada modelo.
4.2 Criando os data frames
# criando um vetor com o nome das variáveis e uma lista com os data-frames criados.
# Names of the factor columns that contain missing values. Each one gets its
# own imputation model, in this order.
vars <- c('GarageType', 'GarageFinish',
          'GarageCond', 'BsmtExposure',
          'BsmtQual', 'BsmtCond',
          'BsmtFinType1', 'MasVnrType',
          'MSZoning', 'Functional', 'Exterior1st',
          'Exterior2nd', 'KitchenQual')

# One data frame per entry of `vars`: test.fac without the other twelve
# incomplete columns, so the target is the only column with missing values.
# The list is unnamed and follows the order of `vars`.
list.df <- lapply(vars, function(target) {
  others <- setdiff(vars, target)
  test.fac[, !(names(test.fac) %in% others), drop = FALSE]
})

# Also expose each element as df.<variable> (df.GarageType, df.GarageFinish,
# ...), as the hand-written version of this chunk did.
invisible(
  list2env(setNames(list.df, paste0("df.", vars)), envir = environment())
)
# A função abaixo automatiza o processo de impute das categoricas nos valores nulos de cada variável.
# Fill the missing entries of one categorical column with model predictions.
#
# Arguments:
#   fac       data frame whose column `var` is to be completed.
#   df.var    data frame holding `var` plus the predictor columns; its rows
#             with a missing `var` are the ones passed to predict().
#   rf.model  fitted model; predict(rf.model, newdata = ...) is called on it.
#   var       name (string) of the column to impute.
#
# Returns `fac` with the NA entries of `var` replaced by the predictions.
f.pred <- function(fac, df.var, rf.model, var) {
  missing.rows <- is.na(df.var[[var]])
  # Nothing to impute: return early instead of calling predict() on an
  # empty data frame.
  if (!any(missing.rows)) {
    return(fac)
  }
  # drop = FALSE keeps a data frame even when only one predictor column is
  # left after removing `var`.
  new.df <- df.var[missing.rows, !(names(df.var) %in% var), drop = FALSE]
  pred_rf <- predict(rf.model, newdata = new.df)
  # Assign the labels rather than the factor object: putting a factor into a
  # character column would store its integer codes instead of the labels.
  fac[is.na(fac[[var]]), var] <- as.character(pred_rf)
  fac
}
# 4.3 Usando a Caret e RandomForest
Utilizarei o random forest como classificador para imputar as categorias faltantes.
set.seed(12345)
# Resampling setup shared by every imputation model: 3-fold cross-validation,
# keeping the hold-out predictions of the final model, without class
# probabilities, summarised by multiClassSummary.
fitControl <- trainControl(
  method = "cv",
  number = 3,
  savePredictions = "final",
  classProbs = FALSE,
  summaryFunction = multiClassSummary
)
# Construindo os modelos para imputar os valores nulos.
set.seed(12345)
# One fitted model per incomplete variable, stored in the order of `vars`.
rf.list <- vector("list", length(vars))
for (j in seq_along(vars)) {
  # Training data: the data frame built for this variable, without the rows
  # that contain missing values.
  df <- list.df[[j]] %>% na.omit()
  # Fit a random forest that predicts vars[j] from all remaining columns.
  rf.list[[j]] <- train(as.formula(paste(vars[j], "~ .")),
                        data = df,
                        tuneLength = 5,
                        trControl = fitControl,
                        method = "rf")
  # Write the predicted categories into the missing entries of test.fac.
  test.fac <- f.pred(test.fac, list.df[[j]], rf.list[[j]], vars[j])
  # Progress indicator: index of the variable just processed.
  cat(j, ' ')
}
## 1 2 3 4 5 6 7 8 9 10 11 12 13
Será que ainda há algum valor nulo?
# Check whether any missing value is left in the factor columns.
test.fac %>% anyNA()
## [1] FALSE
Imputação de dados realizada com sucesso!
5 Juntando os data frames
Juntando os dados do tipo inteiro e os categóricos.
# Put the imputed integer columns and the imputed factor columns side by side.
df.test <- test.int %>% bind_cols(test.fac)
df.test
Agora nosso df.test encontra-se limpo e pronto para ser explorado.
6 Exportando os dados limpos
# Save the cleaned test set.
# NOTE(review): write.csv() writes row names by default, so the file gets an
# extra leading column - confirm that whatever reads it expects that.
write.csv(x = df.test, file = '../outputs/df.test.csv')